from utils import Activities, Users
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"
import plotly.figure_factory as ff
import numpy as np
from config_vars import month_map, month_order
pd.options.display.max_columns = None
pd.options.display.max_rows = None
data_folder = "../data/"
#plotly style:
line_traces = dict(mode='lines+markers',line_shape='spline',line_smoothing=1,marker_size=10,marker_opacity=0.9)
users = Users(f"{data_folder}users.csv")
students = users.df.query("student == 1")
supervisors = users.df.query("supervisor == 1")
teachers = users.df.query("teacher == 1")
activities = Activities(f"{data_folder}activities.csv")
activities.drop_over_year() # TOASK: a Nicola utenti oltre l'anno
activities.map_month(month_map,month_order) # map and sort months
activities.save_raw() # create a copy of non cleaned dataframe
activities.fillna() # fill some columns with 0 instead of NaN
activities.year_to_cat()
feedbacks = pd.read_csv(f"{data_folder}activities_feedbacks_info.csv")
feedbacks = feedbacks.loc[feedbacks['ac_activity'].isin(activities.df['ac_activity']),:]
df_months = pd.read_csv(f"{data_folder}months.csv").fillna(0)
df_months.replace({'month':month_map},inplace=True)
df_months['month_order'] = df_months['month'].map(dict(zip(month_order,range(1,13))))
df_months.sort_values(by=['month_order','activity_school_year'],inplace=True)
y_users = df_months['n_users_per_year']
df_months['norm_avg_n_user_recipes'] = df_months['n_recipes'].divide(y_users)
df_months['norm_avg_n_user_experiences'] = df_months['n_experiences'].divide(y_users)
df_months['norm_avg_n_activities'] = df_months['n_activities'].divide(y_users)
logins = pd.read_csv(f"{data_folder}log_logins.csv")
n_supervisors = len(supervisors)
n_students = len(students)
n_teachers = len(users.df.query("teacher == 1"))
print(f"There are {n_students} students, {n_supervisors} supervisors, and {n_teachers} teachers")
activities.plot_dashboard()
hist_data = activities.df.groupby(by=['creation_month','activity_school_year','start_year'],sort=False).size().reset_index(name="count")
hist_data = activities.sort_month(hist_data,'creation_month',['activity_school_year','start_year'])
fig = hist_data.plot(x="creation_month", y=["count"], facet_col='activity_school_year', color="start_year")
fig.update_layout(
title="Number of activities per start year",
title_x=0.5,
yaxis_title="# of activities",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.show()
fig = students.sort_values(by=["start_year"]).plot.box(
x="start_year",y="n_activities", color="start_year",
hover_data =["us_user","n_activities","start_year"]
)
fig.update_layout(
title="Number of activities per student per start year",
title_x=0.5,
yaxis_title="Activities per student",
)
fig.update_yaxes(range=[0, 600])
fig.show()
hist_data = activities.df.groupby(by=['activity_school_year','start_year','us_user'],sort=False).size().reset_index(name="count")
active_users_per_year = hist_data.query("count > 0").drop("us_user",axis=1).groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count").sort_values(by=["start_year","activity_school_year"])
fig = active_users_per_year.plot(x="start_year", y="count", facet_col='activity_school_year')
fig.update_layout(
title="Number of active users per year",
title_x=0.5,
yaxis_title="# of actitve users",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45, type="category")
fig.update_traces(line_traces)
fig.show()
hist_data = feedbacks.groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count")
fig = hist_data.plot(x="start_year", y=["count"], facet_col='activity_school_year')
fig.update_layout(
title="Number of feedbacks per start year",
title_x=0.5,
yaxis_title="# of feedbacks",
showlegend=False,
)
fig.update_xaxes(title=dict(text=""),tickangle=45,type="category")
fig.update_traces(line_traces)
fig.show()
feedbacks['response_year'] = pd.DatetimeIndex(feedbacks['response_date']).year
data_hist = feedbacks.groupby(by=['recipient','response_year']).size().reset_index(name="count")
fig = data_hist.sort_values(by=["response_year"]).plot.box(
x="response_year",y="count", color="response_year",
hover_data =["recipient","response_year","count"]
)
fig.update_layout(
title="Number of feedbacks per supervisor",
title_x=0.5,
yaxis_title="Feedbacks per supervisor",
)
fig.show()
fig = supervisors.sort_values(by=["start_year"]).plot.box(
x="start_year",y="n_feedback_responses", color="start_year",
hover_data =["us_user","user_name","n_feedback_responses","start_year"]
)
fig.update_layout(
title="Number of feedbacks per supervisor per start year",
title_x=0.5,
yaxis_title="Feedbacks per supervisor",
)
fig.show()
fig = students.sort_values(by=["start_year"]).plot.box(
x="start_year",y="n_feedback_requests", color="start_year",
hover_data =["us_user","user_name","n_feedback_requests","start_year"]
)
fig.update_layout(
title="Number of feedback requests per student",
title_x=0.5,
yaxis_title="Requests per student",
)
fig.show()
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
x="activity_school_year",y="n_images",
color="start_year",
hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)
fig.update_layout(
title="Number of images per activity",
title_x=0.5,
)
fig.show()
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
x="activity_school_year",y="activity_total_length",
color="start_year",
hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)
fig.update_layout(
title="Total lenghts",
title_x=0.5,
)
fig.update_yaxes(range=[0, 600])
fig.show()
fig = df_months.plot(x="month", y=["n_activities"],facet_col='activity_school_year')
fig.update_layout(
title="Total number of activities per month",
title_x=0.5,
yaxis_title="# activities",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.show()
fig = df_months.plot(x="month", y=["norm_avg_n_activities"],facet_col='activity_school_year')
fig.update_layout(
title="Number of activities per users per month",
title_x=0.5,
yaxis_title="average # of activities",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.show()
fig = df_months.plot(x="month", y=["n_edits"],facet_col='activity_school_year')
fig.update_layout(
title="Total number of activities per month",
title_x=0.5,
yaxis_title="# activities",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.show()
fig = activities.sort_month(
activities.df[['last_edit_month','activity_school_year','in_curriculum']],
'last_edit_month',['activity_school_year']
).hist(x="last_edit_month", y="in_curriculum",
facet_col='activity_school_year',
)
fig.update_layout(
title="# of activities in curriculum per month",
title_x=0.5,
yaxis_title="# activities in curriculum",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.show()
feedbacks['response_month'] = pd.DatetimeIndex(feedbacks['response_date']).month
feedbacks.replace({'response_month':month_map},inplace=True)
feedbacks['request_month'] = pd.DatetimeIndex(feedbacks['request_date']).month
feedbacks.replace({'request_month':month_map},inplace=True)
hist_requests = activities.sort_month(
feedbacks.groupby(['activity_school_year','request_month']).size().reset_index(name="count"),
'request_month',['activity_school_year']
).rename(columns={'request_month':'month'}).assign(type = 'request')
hist_responses = activities.sort_month(
feedbacks.groupby(['activity_school_year','response_month']).size().reset_index(name="count"),
'response_month',['activity_school_year']
).rename(columns={'response_month':'month'}).assign(type = 'response')
hist_data = hist_requests.append(hist_responses)
fig = hist_data.plot(
x="month", y=["count"],facet_col='activity_school_year', color="type"
)
fig.update_layout(
title="Total number of feedback requests and responses per month",
title_x=0.5,
yaxis_title="count",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.show()
feedbacks['has_response'] = feedbacks['response_date'].notnull()
a = feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
feedbacks.groupby(["request_month","activity_school_year"]).size()
).reset_index(name="ratio")
total = feedbacks.groupby(["request_month","activity_school_year"]).size()
fig = activities.sort_month(
feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
feedbacks.groupby(["request_month","activity_school_year"]).size()
).reset_index(name="ratio").assign(
total = feedbacks.groupby(["request_month","activity_school_year"]).size().values.astype('int64')
),'request_month',['activity_school_year']
).plot.scatter(
x="request_month", y=["ratio"],facet_col='activity_school_year',
size="total",
color="total",
size_max=60
)
fig.update_layout(
title="Ratio responses/requests",
title_x=0.5,
yaxis_title="ratio",
showlegend = False
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces,line=dict(color="gray"))
fig.show()
tot_req = feedbacks.groupby(['recipient']).size()
tot_resp = feedbacks.query("has_response == True").groupby(['recipient']).size()
tot_resp = (tot_req-tot_req+b).fillna(0)
ratio = tot_resp.div(tot_req).reset_index(name="ratio").rename(columns={'recipient':'us_user'})
ratio = ratio.merge(supervisors[['us_user','start_year','user_name']],on="us_user",how="right").dropna()
tot_req = tot_req.reset_index(name="received").rename(columns={'recipient':'us_user'})
ratio = ratio.merge(tot_req,on="us_user",how="left")
ratio['responsed'] = ratio['ratio']*ratio['received']
ratio.sort_values(by="ratio",inplace=True)
fig = ratio.plot.scatter(x="us_user",y="ratio",color="received")
fig.update_layout(
title="Ratio responses/requests per each supervisor",
title_x=0.5,
xaxis_title='supervisor',
xaxis_type='category'
)
fig.update_traces(mode='markers',opacity=0.8,)
fig.add_trace(ratio.plot.bar(x="us_user",y="ratio",color="received").data[0])
fig.update_xaxes(showticklabels=False)
fig.show()
ratio['norm'] = ratio['responsed']*ratio['ratio']**2
bests=ratio.sort_values(by=["norm"],ascending=False).head(10)
bests
df_feedbacks_info = feedbacks
no_answer = df_feedbacks_info['student_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without student grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
no_answer = df_feedbacks_info['response_date'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without answer ({round(no_answer/len(df_feedbacks_info),3)*100}%)')
no_answer = df_feedbacks_info['supervisor_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without supervisor grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
response_no_grade = (df_feedbacks_info['response_date'].notnull() & df_feedbacks_info['supervisor_grade'].isnull()).sum()
grade_no_response = (df_feedbacks_info['response_date'].isnull() & df_feedbacks_info['supervisor_grade'].notnull()).sum()
print(f'There are {response_no_grade} responses without grades')
grade_no_response #TODO: VERIFICARE QUESTO!
df_feedbacks_info.dropna(inplace=True)
df_feedbacks_info['delay_days'] = df_feedbacks_info['delay_hours']/24
df_hist = df_feedbacks_info.groupby(["activity_school_year","times_before_answer"]).count()[['ac_activity']].reset_index().rename(columns={'ac_activity': 'count'})
fig = df_hist.plot.hist(x="times_before_answer", y='count',
color = "activity_school_year", histnorm='percent' )
fig.update_layout(
barmode='group',
xaxis_type='category',
title="Requests before a feedback",
title_x=0.5,
yaxis_title="Percent",
xaxis_title="Number of requests before the response",
coloraxis_showscale=False,
)
fig.show()
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days", nbins=30,
color = "activity_school_year" )
fig.update_layout(
barmode='group',
title="Histogram: days before an answer",
title_x=0.5,
yaxis_title="Count",
xaxis_title="days before an answer",
coloraxis_showscale=False,
)
fig.show()
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days", nbins=30,
color = "activity_school_year", histnorm='percent' )
fig.update_layout(
barmode='group',
title="Histogram: normalized number of days before an answer",
title_x=0.5,
yaxis_title="Percent",
xaxis_title="days before an answer",
coloraxis_showscale=False,
)
fig.show()
fig = df_feedbacks_info[df_feedbacks_info['edits_after']<5].plot.hist(x="edits_after", nbins=5,
color = "activity_school_year", histnorm='percent' )
fig.update_layout(
barmode='group',
title="Histogram: normalized number of edits after a feedback",
title_x=0.5,
yaxis_title="Percent",
xaxis_title="# of edits after an answer per feedback request",
)
fig.show()
df_feedbacks_info['has_edit_after'] = (df_feedbacks_info['edits_after']>0).astype(int)
df_count = df_feedbacks_info.groupby(["activity_school_year","supervisor_grade","has_edit_after"]).size().reset_index(name='count')
c = (pd.core.reshape.util.cartesian_product([df_count['activity_school_year'].unique(),df_count['supervisor_grade'].unique(),df_count['has_edit_after'].unique()]))
c = pd.DataFrame(dict(activity_school_year=c[0],supervisor_grade=c[1],has_edit_after=c[2]))
c = c.set_index(["activity_school_year","supervisor_grade","has_edit_after"]).join(df_count.set_index(["activity_school_year","supervisor_grade","has_edit_after"])).reset_index().sort_values(by=['activity_school_year','supervisor_grade'])
c.fillna(0,inplace=True)
df_feedback_edits = c[c['has_edit_after']==True].drop("has_edit_after",axis=1).rename(columns={'count':'edit'})
df_feedback_edits['no_edit'] = c[c['has_edit_after']==False]['count'].tolist()
df_feedback_edits['ratio'] = df_feedback_edits['edit'].div(df_feedback_edits['no_edit']+df_feedback_edits['edit']).fillna(0)
fig = df_feedback_edits.plot(x="supervisor_grade", y=["edit"],
line_group='activity_school_year', color='activity_school_year')
fig.update_layout(
title="# activities that have been edit after a feedback per grade",
title_x=0.5,
yaxis_title="# activities",
xaxis_title="supervisor grade",
)
fig.update_traces(line_traces)
fig.show()
dayofweek_map = {1:'Sun',2:'Mon',3:'Tue',4:'Wed',5:'Thu',6:'Fri',7:'Sat'}
logins_students = logins.merge(students,on="us_user",how="inner")
logins_supervisors = logins.merge(supervisors,on="us_user",how="inner")
logins_teachers = logins.merge(teachers,on="us_user",how="inner")
logins['date'] = logins['date'].astype('datetime64')
fig = logins_students.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
{'dayofweek':dayofweek_map}
).plot(
x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)
fig.update_layout(
title="Apprentices logins",
title_x=0.5,
yaxis_title="# logins",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))
fig.show()
fig = logins_supervisors.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
{'dayofweek':dayofweek_map}
).plot(
x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)
fig.update_layout(
title="Supervisors logins",
title_x=0.5,
yaxis_title="# logins",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))
fig.show()
fig = logins_teachers.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
{'dayofweek':dayofweek_map}
).plot(
x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)
fig.update_layout(
title="Teachers logins",
title_x=0.5,
yaxis_title="# logins",
legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))
fig.show()